Using deep learning and training data created from user clicks


In [258]:
%matplotlib inline
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

Import the training data set from a pre-created CSV file


In [259]:
dataframe = pd.read_csv("data/auto_training_for_ipy.csv")
train_features = dataframe.ix[:,0:10]
train_labels = dataframe.ix[:,10:11]
train_labels['label'] = train_labels['label'].astype(int)
train_labels['label'] = train_labels['label'].map({-1: 0, 1: 1})

In [260]:
print('features: ', train_features.shape)
print('labels  : ', train_labels.shape)


features:  (2998, 10)
labels  :  (2998, 1)

In [261]:
train_features.head()


Out[261]:
term_score releaseDate_score versionNum_score processingL_score allPop_score monthPop_score userPop_score spatialR_score temporalR_score click_score
0 0.71 -2.780000e+10 2.0 -1 -92 133 -180 109.89 144.0 0
1 1.51 -1.080000e+11 -19.0 1 -314 -99 -892 -389.00 0.0 13
2 1.69 -3.350000e+11 -3.0 0 -27 -47 -402 -108.78 -15108.0 2
3 6.59 9.680000e+09 19.0 1 -8 -73 188 -16.12 5016.0 0
4 2.44 3.970000e+10 -19.0 -1 692 257 567 4.56 -5028.0 -25

In [262]:
train_features.describe()


Out[262]:
term_score releaseDate_score versionNum_score processingL_score allPop_score monthPop_score userPop_score spatialR_score temporalR_score click_score
count 2998.000000 2.998000e+03 2998.000000 2998.000000 2998.000000 2998.000000 2998.000000 2998.000000 2.998000e+03 2998.000000
mean 1.930040 -1.920570e+10 -3.415710 0.101067 -72.448632 -1.579720 -113.866578 -16.695013 -8.413122e+04 3.402935
std 4.392566 2.737896e+11 10.628665 0.935771 475.795673 280.923559 477.643275 1237.701241 6.286458e+05 37.015202
min -14.230000 -2.890000e+12 -20.000000 -2.000000 -998.000000 -712.000000 -997.000000 -27298.000000 -1.839600e+06 -475.000000
25% 0.330000 -1.910000e+11 -16.000000 -1.000000 -430.000000 -196.750000 -481.000000 -20.560000 -6.480000e+02 0.000000
50% 1.740000 -3.890000e+09 0.000000 0.000000 -56.500000 -4.000000 -95.500000 0.000000 0.000000e+00 0.000000
75% 3.247500 3.920000e+10 0.000000 1.000000 242.000000 187.000000 168.000000 15.885000 1.200000e+01 2.000000
max 13.850000 2.990000e+12 20.000000 3.000000 997.000000 727.000000 999.000000 27283.800000 1.839600e+06 555.000000

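Before modelling, it is worth a quick check of the class balance; pandas value_counts counts each label:


In [ ]:
# count relevant (1) vs. not-relevant (0) documents in the training labels
print(train_labels['label'].value_counts())
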
Distribution of the term_score feature


In [263]:
(train_features
     .term_score
     .plot
     .line(lw=0.8))

plt.title('Term score')
plt.xlabel('ID')


Out[263]:
<matplotlib.text.Text at 0x121592748>

In [264]:
train_features.term_score.hist()


Out[264]:
<matplotlib.axes._subplots.AxesSubplot at 0x1215b8208>

In [265]:
print('mean: ', train_features.term_score.mean())
print('var : ', train_features.term_score.var())


mean:  1.93004002668
var : 19.2946365016

To get a better understanding of the problem domain, take a look at the correlation matrix


In [266]:
import seaborn as sns
correlations = train_features.corr()
corr_heat = sns.heatmap(correlations)
plt.title('Ranking feature correlations')


Out[266]:
<matplotlib.text.Text at 0x121a285f8>

In [267]:
(correlations
     .term_score
     .drop('term_score') # don't compare with myself
     .sort_values(ascending=False)
     .plot
     .barh())


Out[267]:
<matplotlib.axes._subplots.AxesSubplot at 0x1219a9fd0>

Import the human-labelled test set


In [268]:
dataframe = pd.read_csv("data/humanlabelled_for_ipy.csv")
test_features = dataframe.ix[:,0:10]
test_labels = dataframe.ix[:,10:11]
test_labels['label'] = test_labels['label'].astype(int)
test_labels['label'] = test_labels['label'].map({-1: 0, 1: 1})

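The same shape check as for the training set confirms the two splits line up (assuming the CSV shares the training file's column layout):


In [ ]:
print('features: ', test_features.shape)
print('labels  : ', test_labels.shape)
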
In [269]:
from sklearn import preprocessing
# Keras building blocks for the deep learning ranking model
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import plot_model
from keras.utils import np_utils

Data scaling


In [270]:
# fit the scaler on the training features only, then apply it to both splits
scaler = preprocessing.StandardScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)
# one-hot encode the {0, 1} labels for the two-unit softmax output
train_labels = np_utils.to_categorical(train_labels, num_classes=2)
test_labels = np_utils.to_categorical(test_labels, num_classes=2)

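A quick sanity check of the scaling: after StandardScaler, each training column should have mean close to 0 and standard deviation close to 1.


In [ ]:
# each column of the scaled training matrix should be roughly N(0, 1)
print('means:', np.round(train_features.mean(axis=0), 3))
print('stds :', np.round(train_features.std(axis=0), 3))
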
Build a sequential neural-network model


In [271]:
model = Sequential()
model.add(Dense(8, input_dim=10, activation='relu'))
model.add(Dense(6, activation='relu'))
model.add(Dense(4, activation='relu'))
# model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
# categorical_crossentropy matches the one-hot labels and two-unit softmax;
# binary_crossentropy here would make Keras report a misleading accuracy
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

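Keras can print a layer-by-layer overview of the network, handy for checking output shapes and parameter counts:


In [ ]:
model.summary()
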
Batch training with model.fit (kept for reference; the online-learning loop below is used instead)


In [272]:
# train_loss_and_metrics = model.fit(train_features, train_labels, epochs = 5, batch_size = 32)
# print(train_loss_and_metrics)
# test_loss_and_metrics = model.evaluate(test_features, test_labels, batch_size=128)
# print(test_loss_and_metrics)

Online learning: train one mini-batch at a time with train_on_batch, evaluating on the test set after each update


In [ ]:
test_accuracy = []
test_loss = []
train_accuracy = []
train_loss = []

increment = 64
# split the training set into mini-batches of 64 rows
chunks_train_data = [train_features[x:x+increment] for x in range(0, len(train_features), increment)]
chunks_train_labels = [train_labels[x:x+increment] for x in range(0, len(train_labels), increment)]

for epoch in range(5):
    for i, el in enumerate(chunks_train_data):
        print(i)
        # one gradient update per mini-batch
        train_loss_and_metrics = model.train_on_batch(el, chunks_train_labels[i])
        train_loss.append(train_loss_and_metrics[0])
        train_accuracy.append(train_loss_and_metrics[1])
        # evaluate on the held-out set after every update to trace the learning curve
        test_loss_and_metrics = model.evaluate(test_features, test_labels, batch_size=128)
        test_loss.append(test_loss_and_metrics[0])
        test_accuracy.append(test_loss_and_metrics[1])

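After the incremental passes, a single held-out evaluation gives the final test metrics (the same evaluate call used inside the loop):


In [ ]:
final_loss, final_acc = model.evaluate(test_features, test_labels, batch_size=128)
print('final test loss    :', final_loss)
print('final test accuracy:', final_acc)
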
Visualize the learning curves


In [274]:
fig = plt.figure()

ax1 = fig.add_subplot(211)
ax1.plot(train_loss)
ax1.plot(test_loss)
ax1.set_ylabel('Loss')
ax1.legend(['training', 'testing'], loc='upper left')

ax2 = fig.add_subplot(212)
ax2.plot(train_accuracy)
ax2.plot(test_accuracy)
ax2.set_ylabel('Accuracy')
ax2.set_xlabel('Iteration')  # the x axis is the mini-batch iteration, shared by both panels
ax2.legend(['training', 'testing'], loc='upper left')


Out[274]:
<matplotlib.legend.Legend at 0x11f7f2e80>

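Finally, the trained ranking model can be persisted for use in the search engine. A minimal sketch using Keras' HDF5 save format (the file path is illustrative):


In [ ]:
# save architecture, weights and optimizer state in one HDF5 file
model.save('data/click_ranking_model.h5')

# later, in the serving code:
# from keras.models import load_model
# model = load_model('data/click_ranking_model.h5')
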
In [ ]: